---
title: "SWD_All_Code_for_Paper"
author: "Omar Hammoud Gallego, Roberto Foa, Xavier Romero-Vidal"
date: "14/10/2022"
output: pdf_document
---

```{r setup and upload packages, include=FALSE, results = FALSE}
library(tm)
library(tidytext)
library(dplyr)
library(SnowballC)
library(ggplot2)
library(textdata)
library(topicmodels)
library("tidyverse")
#install.packages("Twitmo")
library("Twitmo")


library(quanteda)

library(sjlabelled)

#install.packages("reshape")
library(reshape)
library(tidyverse)
library(gdata)

#install.packages("DataCombine")
library("DataCombine")
library(lubridate)


#install.packages("dotwhisker")
library("dotwhisker")
library(lme4)

#install.packages("parameters")  # to calculate ci_method="wald"
library("parameters")

#install.packages("stm")
library("stm")

#install.packages("see")  # Set of packages to visualise models
library("see")
#rm(list=ls()) to clean environment
```

```{r setup, include=FALSE}
# Document-wide chunk defaults: hide code, messages and warnings in the
# knitted output.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
```


```{r, upload data}
# Load the raw tweet export into `Tweets`.
# The working directory is machine-specific: enable exactly one setwd() below.

# Earlier absolute-path variants, kept for reference:
#Tweets <- read.csv("/Users/sarahjewett/Library/Mobile Documents/com~apple~CloudDocs/_Satisfaction_Democracy/All_Tweets_Data.csv", stringsAsFactors = FALSE)
#Tweets<- read.csv("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/All_Tweets_Data.csv")

# Omar's machine (comment out to run from another location)
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Xavi's machine
#setwd("/Volumes/GoogleDrive/Other computers/My Laptop/Documents/_University/2021 Cambridge/_Project/_Tracker/Data/Tweets")

Tweets <- read.csv("All_Tweets_Data.csv")
```

# Topic Modelling
## Quanteda, corpus - tokens - document-feature matrix 

```{r, to assign id to tweets}

# Give every tweet a sequential id (1, 2, ...), stored as a factor in
# `Tweets$document`.
# seq_len() is used instead of 1:nrow(): for an empty data frame 1:0 would
# yield c(1, 0) and the assignment would fail, whereas seq_len(0) is empty.
Tweets$document <- as.factor(seq_len(nrow(Tweets)))

```

```{r, as corpus, echo=FALSE}


# Build the quanteda objects used for topic modelling:
# corpus -> cleaned tokens -> trimmed document-feature matrix.

# Reduce the timestamp to a calendar day
Tweets$created_at <- as.Date(as.character(as.POSIXct(Tweets$created_at)))
# Newspaper as factor
Tweets$newspaper <- as.factor(as.character(Tweets$newspaper))

# Data frame as corpus
corp_Tweets <- corpus(Tweets)

toks_Tweets <- corp_Tweets %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en")) %>%
  # Project-specific stop words, matched as WHOLE tokens.
  # These were previously passed with valuetype = "regex"; an unanchored
  # regex such as "s", "new", "can" or "one" removes every token that merely
  # contains it (e.g. "news", "cancer", "money"), not just the word itself.
  tokens_remove(
    c(
      "new", "s", "rt", "amp", "via", "opinion", "now", "two", "will",
      "can", "get", "like", "one", "three", "live", "bbc", "want", "time",
      "moment"
    ),
    valuetype = "fixed"
  ) %>%
  # Emoji/escape-code fragments: still matched as substrings, as before
  tokens_remove(
    c("fe0f", "0001f947", "27a1", "0001f447", "0001f3a5", "0001f602",
      "0001f60d"),
    valuetype = "regex"
  ) %>%
  # Drop hashtags and @-mentions
  tokens_remove(pattern = "^[#@].+$", valuetype = "regex")

# Keep features occurring at least 10 times ...
dfm_Tweets <- dfm(toks_Tweets) %>%
  dfm_trim(min_termfreq = 10)

# ... and with at least 2 characters
dfm_Tweets <- dfm_keep(dfm_Tweets, min_nchar = 2)
```

## Some basic analysis

```{r, prepare data as dtm, echo=FALSE}

# Print the 40 most frequent features of the trimmed dfm
dfm_Tweets %>% topfeatures(n = 40)

# Features matching the glob "#*" and their 30 most frequent entries.
# NOTE(review): tokens starting with "#" or "@" are removed when toks_Tweets
# is built, so this selection is presumably empty -- confirm.
tweets_dfm_hashtags <- dfm_Tweets %>% dfm_select(pattern = "#*")
tweets_dfm_hashtags %>% topfeatures(n = 30)

# Convert to the input format expected by the topicmodels package
dfm_Tweets_tm <- dfm_Tweets %>% convert(to = "topicmodels")
#head(dfm_Tweets)
```


## Select Number of Topics for LDA Model

```{r, select optimal number of Topics, eval = TRUE}

# Scan candidate topic counts (7 to 25) with ldatuning and plot the four
# fit metrics. Very compute-intensive: the search is spread over 12 workers.

#library("parallel")
#detectCores()
# My computer has 12 cores that I will use to run the topic models.

#install.packages("ldatuning")
library(ldatuning)

t1 <- Sys.time() # Start timer

result <- FindTopicsNumber(
  dfm_Tweets_tm,
  topics   = seq(7, 25, by = 1),
  metrics  = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method   = "Gibbs",
  control  = list(seed = 77),
  mc.cores = 12,
  verbose  = TRUE
)

t2 <- Sys.time() # End timer
t2 - t1

FindTopicsNumber_plot(result)

# It seems like 9/12/14 are the best number of topics, when running all tweets together.

```


## All Tweets 

```{r, All Tweets LDA with 19 topics}


# Fit the LDA topic model (Gibbs sampling, k = 19) on all tweets and time it.
t1 <- Sys.time() # Start timer

all_Tweets_lda <- LDA(
  dfm_Tweets_tm,
  k = 19,
  method = "Gibbs",
  # A fixed seed makes the fit reproducible. Without it the topic numbering
  # changes between runs, while the topic labels assigned further down are
  # keyed on topic numbers. 77 is the seed already used for
  # FindTopicsNumber() in this file.
  control = list(alpha = 0.02, delta = 0.02, seed = 77)
)

t2 <- Sys.time() # End timer
t2 - t1

```

```{r, show LDA All Tweets}

# Per-topic word probabilities (beta) of the fitted model
all_tweets_topics <- tidy(all_Tweets_lda, matrix = "beta")

# Ten highest-beta terms per topic, sorted by topic and descending beta
all_tweets_topics_top_terms <- all_tweets_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic; terms are ordered within each facet
ALL_Tweets_Topics <- ggplot(
  mutate(all_tweets_topics_top_terms, term = reorder_within(term, beta, topic)),
  aes(beta, term, fill = factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered() +
  ggtitle("All Tweets")

# NOTE(review): the file name mentions "Top_15" and "14_Topics", while this
# chunk keeps 10 terms per topic -- confirm the name is still intended.
setwd("/Users/HammoudG/Documents/GitHub/Satisfaction_Democracy/data-viz/")
ggsave("All_Tweets_Topics_Top_15_14_Topics.png", ALL_Tweets_Topics, width = 16, height = 9)
#setwd("/Users/xavierromero/Documents/GitHub/Satisfaction_Democracy/data-viz")
#ggsave("All_Tweets_Topics_Top_13_27Dec21.png",ALL_Tweets_Topics)


```

```{r, assign Topic to ALL tweets DONE}

# Attach to every tweet the topic with the highest document-topic
# probability (gamma).

all_tweets_gamma <- tidy(all_Tweets_lda, matrix = "gamma")

# Document names have the form "text<id>"; keep the part after "text" so it
# can be matched against Tweets$document
all_tweets_gamma$twitter_id <- str_split_fixed(all_tweets_gamma$document, "text", 2)[, 2]

# transform in factor
all_tweets_gamma$twitter_id <- as.factor(as.character(all_tweets_gamma$twitter_id))
all_tweets_gamma$topic <- as.factor(all_tweets_gamma$topic)

# Exactly one row per tweet.
# top_n(1, gamma) keeps every tied row, so a tweet whose best gamma is shared
# by several topics would appear several times and be duplicated by the join
# below. slice_max(..., with_ties = FALSE) returns a single row per group.
all_tweets_gamma_1 <- all_tweets_gamma %>%
  select(twitter_id, topic, gamma) %>%
  group_by(twitter_id) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()

# Tweets without a modelled document keep NA in topic/gamma
All_Tweets_Classified_14 <- left_join(
  Tweets,
  all_tweets_gamma_1,
  by = c("document" = "twitter_id")
)


```

```{r, rename Topics for all tweets, eval =FALSE}

# Replace numeric topic ids with descriptive labels (several ids share one
# label), drop unclassified tweets, and save the classified data set.

#Check_Topic_Modelling<- All_Tweets_Classified_14 %>% filter(topic == '17')

All_Tweets_Classified_14 <- All_Tweets_Classified_14 %>%
  mutate(topic = fct_recode(topic,
                            "Uk_Politics" = "1",
                            "US_Politics" = "2",
                            "Daily_News" = "3",
                            "Crime" = "4",
                            "Violence" = "5",
                            "Covid" = "6",
                            "Puppy_News" = "7",
                            "Enterteinment" = "8",
                            "Climate_Change" = "9",
                            "Enterteinment" = "10",
                            "Daily_News" = "11",
                            "Daily_News" = "12",
                            "Royal_Family" = "13",
                            "Weather" = "14",
                            "Daily_News" = "15",
                            "Sports" = "16",
                            "Covid" = "17",
                            "Crime" = "18",
                            "Covid" = "19"))

library(gdata)

# Remove tweets with no topic. is.na() is the explicit missing-value test;
# the previous comparison against the string "NA" only worked because
# filter() drops rows whose condition evaluates to NA.
All_Tweets_Classified_14 <- All_Tweets_Classified_14 %>%
  filter(!is.na(topic)) %>%
  drop.levels()

summary(All_Tweets_Classified_14$topic)

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")
#All_Tweets<- read.csv("All_Tweets_Classified_January_17_2022_14topics.csv")

write.csv(All_Tweets_Classified_14, "All_Tweets_Classified_January_17_2022_14topics.csv")

library(dplyr)

# NOTE(review): after the recode above no level is named "3" any more (it
# became "Daily_News"), so this filter presumably returns zero rows --
# confirm which topic was meant to be inspected.
All_Tweets_Classified_14_ONE <- All_Tweets_Classified_14 %>%
  dplyr::select(text, topic) %>%
  dplyr::filter(topic == "3")

#summary(as.factor(All_Tweets$topic))
```


# Sentiment Analysis

```{r, upload tweets Omar Only}

# Reload the topic-classified tweets from disk into `Tweets`.

# Omar's machine. Comment out to run from your own location.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

# Read the data
Tweets <- read.csv(
  "All_Tweets_Classified_January_17_2022_14topics.csv",
  encoding = "UTF-8"
)

```

```{r, upload tweets Xavi Only, eval = FALSE}

# Xavi's machine: load the sentiment-scored tweets and the combined
# survey/tweet data set.

#setwd("G:/Other computers/My Laptop/Documents/_University/2021 Cambridge/_Project/_Tracker/Data/Tweets")

setwd("/Volumes/GoogleDrive/Other computers/My Laptop/Documents/_University/2021 Cambridge/_Project/_Tracker/Data/Tweets")

# Read the data
SS_CS_Tweets <- read.csv(
  "All_Tweets_Classified_Sentiment.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

#SS_CS_Tweets_Crime<- SS_CS_Tweets %>% filter(topic == as.factor("Love")) #%>% filter(newspaper.x == "daily_mail")

CS_Tweets <- read.csv(
  "Combined_Survey_Tweets.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)


```

# Frequency Topic

```{r, prepare data for plot frequency topics by newspaper}

# Daily share of each topic per newspaper, plus a weekly-bin average of that
# share, for the frequency plot.

Tweets$newspaper <- as.factor(as.character(Tweets$newspaper))
Tweets$topic <- as.factor(as.character(Tweets$topic))

# Reduce created_at to a calendar day (GMT).
# Full timestamps ("2021-01-31T12:00:00...") are parsed first. Values that do
# not match that format but are plain dates ("2021-01-31") are parsed in a
# second pass; previously such rows silently became NA.
created_raw <- as.character(Tweets$created_at)
created_day <- as.Date(
  as.POSIXct(created_raw, format = "%Y-%m-%dT%H:%M:%OS", tz = "GMT"),
  tz = "GMT"
)
date_only <- is.na(created_day) & !is.na(created_raw)
created_day[date_only] <- as.Date(created_raw[date_only], format = "%Y-%m-%d")
Tweets$created_at <- created_day

# Tweets per newspaper/topic/day, then each topic's share of that
# newspaper's tweets on that day
Tweets_P <- Tweets %>%
  group_by(newspaper, topic, created_at) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  group_by(created_at, newspaper) %>%
  mutate(perc = n / sum(n)) %>%
  ungroup()

library(lubridate)

# Mean share per 7-day bin and topic.
# NOTE(review): the grouping does not include newspaper, so the mean pools
# all newspapers although the plot is faceted by newspaper -- confirm intent.
Tweets_P <- Tweets_P %>%
  group_by(week = floor_date(created_at, "7 days"), topic) %>%
  mutate(perc_week_topic = mean(perc)) %>%
  ungroup()

```

```{r, plot frequency topic}

# Stacked area chart of the weekly-averaged topic shares, one panel per
# newspaper, saved as a PNG.
TOPIC_TWEET <- Tweets_P %>%
  ggplot(aes(created_at, perc_week_topic, colour = topic, fill = topic)) +
  geom_area(alpha = 0.9) +
  facet_wrap(~ newspaper) +
  labs(
    title = "Topics Tweeted by Newspaper (Weekly average)",
    x = "Date",
    y = "Percentage of Tweets each week"
  )

setwd("/Users/HammoudG/Documents/GitHub/Satisfaction_Democracy/data-viz")

ggsave("Topic_Tweets_17_January.png", TOPIC_TWEET, width = 16, height = 9)
```

```{r, prepare tweets df}

# Keep only the columns needed for sentiment scoring
Tweets_Text <- Tweets %>%
  select(text, newspaper, created_at, X)

# Reduce created_at to a calendar day.
# If Tweets$created_at has already been converted to Date earlier in the
# document, it must not be parsed again: a Date printed as "2021-01-31" does
# not match the timestamp format below, so re-parsing would turn every date
# into NA (and collapse the weekly grouping further down).
if (!inherits(Tweets_Text$created_at, "Date")) {
  created_ts <- as.POSIXct(
    as.character(Tweets_Text$created_at),
    format = "%Y-%m-%dT%H:%M:%OS"
  )
  Tweets_Text$created_at <- as.Date(format(created_ts, "%Y-%m-%d"))
}





```

```{r, unnest tokens tweets}



# Clean the tweet text and split it into one lower-case word per row.

# Strip links and @-mentions
Tweets_Text$text <- gsub("https\\S*", "", Tweets_Text$text)
Tweets_Text$text <- gsub("@\\S*", "", Tweets_Text$text)
# Remove the HTML entity for "&". Only the entity itself is matched: a bare
# "amp" pattern also deleted those letters inside ordinary words
# ("campaign" -> "caign", "champion" -> "chion"), so they could no longer be
# found in the sentiment lexicon.
Tweets_Text$text <- gsub("&amp;?", " ", Tweets_Text$text)
# Line breaks become a space; deleting them fused the last word of one line
# with the first word of the next.
Tweets_Text$text <- gsub("[\r\n]+", " ", Tweets_Text$text)
# Remaining punctuation
Tweets_Text$text <- gsub("[[:punct:]]", "", Tweets_Text$text)

# Set the text to lowercase
Tweets_Text$text <- tolower(Tweets_Text$text)

# One row per word, minus stop words and the retweet marker.
# No stemming: words with the same root can carry different sentiment.
tok_tweets <- Tweets_Text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "rt")
    
```

```{r, give sentiment to all tweets}

library(lubridate)

# Score each tweet with the AFINN lexicon:
#  1. keep only words that have an AFINN value,
#  2. sum the word values per tweet (X), keeping newspaper and day,
#  3. add the mean tweet score per newspaper and 7-day bin.
result <- tok_tweets %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(newspaper, created_at, X) %>%
  summarise(value = sum(value)) %>%
  ungroup() %>%
  group_by(week = floor_date(created_at, "7 days"), newspaper) %>%
  mutate(weekly_mean_value_sentiment = mean(value)) %>%
  ungroup()

```

```{r, join with main sentiment per week}

# Attach the per-tweet sentiment score to the topic-classified tweets,
# matching on the row id X
All_Tweets_Classified_Sentiment <- Tweets %>%
  left_join(result, by = "X")

# Tweets without any scored word get a sentiment of 0
no_score <- is.na(All_Tweets_Classified_Sentiment$value)
All_Tweets_Classified_Sentiment$value[no_score] <- 0

# Drop the columns duplicated by the join and the helper id columns
All_Tweets_Classified_Sentiment <- select(
  All_Tweets_Classified_Sentiment,
  -newspaper.y, -created_at.y, -document, -X.1
)



```

```{r, Save dataset with topic and sentiment, eval =FALSE}


# Write the tweets with topic and sentiment to the shared project folder
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(
  All_Tweets_Classified_Sentiment,
  "All_Tweets_Classified_Sentiment.csv"
)

```


# Upload Tweets with Topics and Sentiment 

```{r, Upload data produced up here of All_Tweets_Classified_Sentiment}

# Reload the tweets with topic and sentiment into `CS_Tweets`.

# Omar's machine. Comment out to run from your own location.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

# Read the data
CS_Tweets <- read.csv(
  "All_Tweets_Classified_Sentiment.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

library(dplyr)

# Topic as factor
CS_Tweets$topic <- as.factor(as.character(CS_Tweets$topic))

#CS_Tweets_1 <- CS_Tweets %>% select(text, newspaper.x, topic) %>%
#    filter(topic == "UK_Government_Pandemic_Response")

#summary(CS_Tweets$topic)

# Rename Variables
#CS_Tweets <- dplyr::rename(CS_Tweets,  sentiment_value = value, newspaper = newspaper.x)

#setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

#write.csv(CS_Tweets, "All_Tweets_Classified_Sentiment.csv")

```


# Averaging of tweets per daily analysis


```{r, summary tweets by day}

# Aggregate tweets to one row per newspaper, day and topic: share of tweets
# (perc), mean sentiment, and summed retweets.

# Parse the day string and store it as Date
CS_Tweets$created_at.x <- as.Date(as.character(as.POSIXct(
  strptime(CS_Tweets$created_at.x, format = "%Y-%m-%d")
)))

# Columns needed for the aggregation, with topic and newspaper as factors
Tweets_S <- CS_Tweets %>%
  dplyr::select(newspaper, created_at.x, public_metrics.retweet_count, topic, sentiment_value)
Tweets_S$topic <- as.factor(as.character(Tweets_S$topic))
Tweets_S$newspaper <- as.factor(as.character(Tweets_S$newspaper))

# Relative salience: each topic's percentage of a newspaper's tweets that day
Gr_Tweets <- Tweets_S %>%
  group_by(newspaper, created_at.x, topic) %>%
  summarise(n = n()) %>%
  mutate(perc = 100 * n / sum(n))

# Mean sentiment per newspaper, day and topic
Gr_Sent_Tweets <- Tweets_S %>%
  group_by(newspaper, created_at.x, topic) %>%
  summarise(mean_sentiment_value = mean(sentiment_value))  #%>% Mean of sentiment values
    #summarise(sentiment_value = sum(sentiment_value))
    # This is to calculate the total sum of sentiment values

# Reach: summed retweets per newspaper, day and topic
Gr_All_Tweets_Retw <- CS_Tweets %>%
  group_by(newspaper, created_at.x, topic) %>%
  summarise(retweets = sum(public_metrics.retweet_count))

# Weighted mean sentiment
#Gr_WSent_Tweets<- Gr_All_Tweets %>%
#    group_by(newspaper, created_at.x) %>%
#    summarise(wmean_sentiment_value = weighted.mean(mean_sentiment_value, perc))

# Reach (Sum of RTs)
#Gr_Ret_Tweets<- Tweets_S %>%
#  group_by(newspaper, created_at.x, topic) %>%
#  summarise(tot_retweets = sum(public_metrics.retweet_count))

#Gr_All_Tweets<- left_join(Gr_All_Tweets, Gr_WSent_Tweets,
#                          by = c("newspaper", "created_at.x"))

# Combine the three summaries on their shared keys
join_keys <- c("newspaper", "created_at.x", "topic")
Gr_All_Tweets <- Gr_Tweets %>%
  left_join(Gr_Sent_Tweets, by = join_keys)
Gr_All_Tweets <- Gr_All_Tweets %>%
  left_join(Gr_All_Tweets_Retw, by = join_keys)

# Set all retweet counts of bbc_news to zero
Gr_All_Tweets$retweets[Gr_All_Tweets$newspaper == "bbc_news"] <- 0






#Gr_All_Tweets<- left_join(Gr_All_Tweets, Gr_Ret_Tweets, 
#                          by = c("newspaper", "created_at.x", "topic"))

# TO REVIEW LATER ON TO ADD MEAN WEEKLY SENTIMENT
#merge weekly mean value sentiment
#Only_weekly_mean_sentiment<- CS_Tweets %>% select(newspaper, created_at.x, topic,weekly_mean_value_sentiment) %>%
#    group_by(newspaper, created_at.x,topic, weekly_mean_value_sentiment) %>%
#    summarise(n= n()) %>%
#    mutate(perc = n/sum(n)) %>%
#    ungroup()

#AA<- left_join(Only_weekly_mean_sentiment,Gr_All_Tweets, 
 #                         by = c("newspaper", "created_at.x", "topic"))

#AA<- AA %>% distinct()

#Comparing mean and wmean
# Gr_All_Tweets_salient <- na.omit(Gr_All_Tweets) #Skipping NaN for topics with 0 retweets 
# cor(Gr_All_Tweets_salient$mean_sentiment_value, Gr_All_Tweets_salient$wmean_sentiment_value) 
# ggplot(Gr_All_Tweets_salient, aes(x=mean_sentiment_value, y=wmean_sentiment_value)) + 
#    geom_point()


#Check if percentage = 1
#Gr_Tweets %>%
#    filter(newspaper.x=="daily_mail") %>%
#   filter(created_at.x == "2019-10-25") %>%
#  summarise(tot = sum(perc))


# To double check that sentiment value mean calculations are correct
# Tweets_S %>%
#    group_by(newspaper.x, created_at.x, topic) %>%
#   filter(newspaper.x=="daily_mail") %>%
#   filter(created_at.x == "2019-08-15") %>%
#  filter(topic == "China") %>%
#    summarise(tot = mean(sentiment_value))


```

```{r, new code for rolling averages for retweets plus divide by 1000}

# Mean retweets per topic and newspaper over date bins of 2, 3, 7, 10, 15,
# 20, 25 and 30 days. Each step adds the bin start date (from floor_date())
# and the bin mean as new columns.
# NOTE(review): these are fixed bins, not moving windows -- confirm that this
# is what "rolling average" is meant to be here.

#summary(Gr_All_Tweets$retweets)

# Rescaling retweets to thousands was considered but not done:
#Gr_All_Tweets$retweets<- Gr_All_Tweets$retweets/1000 # Not Done

Gr_All_Tweets$newspaper_code <- as.factor(as.character(Gr_All_Tweets$newspaper))

Gr_All_Tweets <- Gr_All_Tweets %>%
  group_by(two_day = floor_date(created_at.x, "2 days"), topic, newspaper) %>%
  mutate(retweet_2d_avg = mean(retweets)) %>%
  group_by(three_day = floor_date(created_at.x, "3 days"), topic, newspaper) %>%
  mutate(retweet_3d_avg = mean(retweets)) %>%
  group_by(week = floor_date(created_at.x, "7 days"), topic, newspaper) %>%
  mutate(retweet_7d_avg = mean(retweets)) %>%
  group_by(ten_day = floor_date(created_at.x, "10 days"), topic, newspaper) %>%
  mutate(retweet_10d_avg = mean(retweets)) %>%
  group_by(fifteen_day = floor_date(created_at.x, "15 days"), topic, newspaper) %>%
  mutate(retweet_15d_avg = mean(retweets)) %>%
  group_by(twenty_day = floor_date(created_at.x, "20 days"), topic, newspaper) %>%
  mutate(retweet_20d_avg = mean(retweets)) %>%
  group_by(twentyfive_day = floor_date(created_at.x, "25 days"), topic, newspaper) %>%
  mutate(retweet_25d_avg = mean(retweets)) %>%
  group_by(thirty_day = floor_date(created_at.x, "30 days"), topic, newspaper) %>%
  mutate(retweet_30d_avg = mean(retweets))

```

```{r, Save dataset with perc of tweets, mean sentiment and tot retweets}

# Write the long-format aggregate (share of tweets, mean sentiment,
# retweets) to the shared project folder.

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

#Gr_All_Tweets<- read.csv("Gr_All_Tweets_Long.csv")

write.csv(
  Gr_All_Tweets,
  "Gr_All_Tweets_Long.csv"
)




```

```{r, Save as wide table }

library(tidyr)

# Save the long table, then reshape it to one row per newspaper and day with
# one column per measure-topic combination, and save that wide table too.

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

#Gr_All_Tweets<- read.csv("Gr_All_Tweets_Long.csv")

write.csv(Gr_All_Tweets, "Gr_All_Tweets_Long.csv")

# Normalise the day column to Date (via character and a GMT timestamp)
Gr_All_Tweets$created_at.x <- as.Date(as.character(as.POSIXct(
  strptime(as.character(Gr_All_Tweets$created_at.x), format = "%Y-%m-%d", tz = "GMT")
)))

#Gr_All_Tweets$round <- as.numeric(Gr_All_Tweets$created_at.x)

Gr_All_Tweets$newspaper <- as.factor(as.character(Gr_All_Tweets$newspaper))
Gr_All_Tweets$topic <- as.factor(as.character(Gr_All_Tweets$topic))

#Gr_All_Tweets <- Gr_All_Tweets %>%                  # Add lagged column
#   group_by(newspaper, topic) %>%
#dplyr::mutate(lag_retweets = lag(retweets, n = 1, default = NA)) %>%
#    mutate(lag_retweets_3 = lag(retweets, n = 3, default = NA)) %>%
#    mutate(lag_retweets_7 = lag(retweets, n = 7, default = NA)) %>%
#  mutate(lag_retweets_14 = lag(retweets, n = 14, default = NA)) %>%
#    mutate(lead_retweets = lead(retweets, n = 1, default = NA)) %>%
#    mutate(lead_retweets_3 = lead(retweets, n = 3, default = NA)) %>%
#    mutate(lead_retweets_5 = lead(retweets, n = 5, default = NA)) %>%
#    mutate(lead_retweets_7 = lead(retweets, n = 7, default = NA)) %>%
#    mutate(lead_retweets_10 = lead(retweets, n = 10, default = NA)) %>%
#    mutate(lead_retweets_14 = lead(retweets, n = 14, default = NA))

# Lag/lead columns that would be added to both calls below if re-enabled:
# lag_retweets,lag_retweets_3, lag_retweets_7, lag_retweets_14, lead_retweets, lead_retweets_3, lead_retweets_5,lead_retweets_7, lead_retweets_10, lead_retweets_14,

# REMOVE WIDE FRAME AS USING ONLY WEEKLY DATA NOW
# NOTE(review): if Gr_All_Tweets is still grouped at this point, select()
# also keeps its grouping columns -- confirm that is intended.
wide_frame <- Gr_All_Tweets %>%
  dplyr::select(
    newspaper, created_at.x, topic,
    retweets, perc, mean_sentiment_value,
    retweet_2d_avg, retweet_3d_avg, retweet_7d_avg, retweet_10d_avg,
    retweet_15d_avg, retweet_20d_avg, retweet_25d_avg, retweet_30d_avg
  ) %>%
  pivot_wider(
    names_from = topic,
    values_from = c(
      retweets, perc, mean_sentiment_value,
      retweet_2d_avg, retweet_3d_avg, retweet_7d_avg, retweet_10d_avg,
      retweet_15d_avg, retweet_20d_avg, retweet_25d_avg, retweet_30d_avg
    )
  )

#setwd("/Volumes/GoogleDrive/Other computers/My Laptop/Documents/_University/2021 Cambridge/_Project/_Tracker/Data/Tweets")

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(wide_frame, "Gr_All_Tweets_Wide.csv")





```

```{r, soft vs hard news, eval= FALSE}

# Add to the wide table the daily percentage of each newspaper's tweets that
# fall into "hard news" topics, and save the result.

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

Gr_All_Tweets <- read.csv("Gr_All_Tweets_Long.csv", stringsAsFactors = TRUE)

# Flag hard-news topics ("1") versus everything else ("0")
Gr_All_Tweets <- Gr_All_Tweets %>%
  mutate(Hard_News = ifelse(topic %in% c("Climate_Change", "Covid",
                                         "Hong_Kong", "Lockdown", "Lockdown_Travel",
                                         "Police_Brutality", "Uk_Politics",
                                         "US_Politics"), "1", "0"))
#"Crime",
Gr_All_Tweets$Hard_News <- as.factor(as.character(Gr_All_Tweets$Hard_News))

# Percentage of tweets per day and newspaper in each Hard_News class.
# The long table has one row per newspaper/day/topic and already carries the
# tweet count in column `n`, so the counts are summed. Counting rows with
# n() would give the share of *topics* present that day, not of tweets.
Salience <- Gr_All_Tweets %>%
  group_by(created_at.x, newspaper, Hard_News) %>%
  summarise(n = sum(n)) %>%
  group_by(created_at.x, newspaper) %>%
  mutate(perc_Hard_News = 100 * n / sum(n))

Salience <- Salience %>% select(!n)

# One column per class ("0", "1"); only the hard-news column is kept
Salience <- pivot_wider(Salience, names_from = Hard_News, values_from = perc_Hard_News)

Salience <- Salience %>% select(!"0")
#Salience_Covid<- Gr_All_Tweets %>% filter(topic == "Uk_Politics")

# Merge two datasets: align key types first

wide_frame$newspaper <- as.factor(as.character(wide_frame$newspaper))
Salience$newspaper <- as.factor(as.character(Salience$newspaper))

# Posixct into date
wide_frame$created_at.x <- as.Date(as.character(as.POSIXct(wide_frame$created_at.x)))
Salience$created_at.x <- as.Date(as.character(as.POSIXct(Salience$created_at.x)))

Final <- wide_frame %>%
  inner_join(Salience, by = c("created_at.x", "newspaper"))

wide_frame <- Final %>% dplyr::rename("Hard_News" = "1")

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(wide_frame, "Gr_All_Tweets_Wide.csv")


```

```{r, weekly average variable created by Roberto, eval= FALSE}

# For every newspaper/topic/day row, compute `weekavg`: the mean of `perc`
# over the same newspaper and topic on the days strictly between
# (day - 7) and the day itself, i.e. the current day is excluded.

# Rename Variables
Gr_All_Tweets <- dplyr::rename(Gr_All_Tweets, created_at = "created_at.x")

#Gr_All_Tweets <- dplyr::rename(Gr_All_Tweets, tot_retweets_topic = tot_retweets)

# The date arithmetic below needs a Date column; after read.csv() the column
# may be character or factor, for which `created_at - 7` does not work.
Gr_All_Tweets$created_at <- as.Date(as.character(Gr_All_Tweets$created_at))

# Integer code per newspaper; as.factor() first so this also works when the
# column is character
Gr_All_Tweets$newspaper_code <- as.numeric(as.factor(Gr_All_Tweets$newspaper))

#####

#Gr_All_Tweets<- Gr_All_Tweets %>%
#    group_by(created_at, topic) %>%
#    mutate(mean_perc_topic = mean(perc))

###
library(lubridate)

#Gr_All_Tweets<- Gr_All_Tweets %>%
#    group_by(week =floor_date(created_at, "7 days"), topic) %>%
#    mutate(perc_week_topic = mean(perc))

#Gr_All_Tweets<- Gr_All_Tweets %>%
#  mutate(Week = as.numeric(created_at))

Gr_All_Tweets$newspaper <- as.character(Gr_All_Tweets$newspaper)
Gr_All_Tweets$topic <- as.character(Gr_All_Tweets$topic)

# Trailing-window average

t1 <- Sys.time() # Start timer

# Work on plain vectors: extracting the columns once avoids re-subsetting
# the whole data frame for every row.
papers <- Gr_All_Tweets$newspaper
topics <- Gr_All_Tweets$topic
days <- Gr_All_Tweets$created_at
shares <- Gr_All_Tweets$perc

weekavg <- rep(NA_real_, nrow(Gr_All_Tweets))

for (i in seq_len(nrow(Gr_All_Tweets))) {
  # which() drops comparisons that evaluate to NA
  in_window <- which(
    papers == papers[i] &
      topics == topics[i] &
      days > (days[i] - 7) &
      days < days[i]
  )
  # An empty window yields NaN
  weekavg[i] <- mean(shares[in_window], na.rm = TRUE)
}

Gr_All_Tweets$weekavg <- weekavg

t2 <- Sys.time() # End timer
t2 - t1





```


# Merge You Gov with Topic Tweets

```{r, Upload Yougov data to merge with wide frame}


# Load the YouGov survey export and the wide tweet table.

# Omar's Windows machine
#setwd("C:/Users/omarh/Documents/GitHub/Satisfaction_Democracy/data-raw/You_Gov/")

# Survey data
setwd("/Users/HammoudG/Documents/GitHub/Satisfaction_Democracy/data-raw/You_Gov/")
You_Gov <- read.csv("You_Gov_Dec_2021.csv")

# Wide tweet table
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")
wide_frame <- read.csv("Gr_All_Tweets_Wide.csv")



```

```{r, preparation You Gov Data for merging Updated}


# Prepare the YouGov survey for merging with the tweet data: type
# conversions, filtering to valid answers, and recoding of numeric answer
# codes into labels.

You_Gov$CAM_demtrack_satisfied <- as.factor(You_Gov$CAM_demtrack_satisfied)
You_Gov$pastvote_EURef <- as.factor(You_Gov$pastvote_EURef)
You_Gov$pastvote_ge_2019 <- as.factor(You_Gov$pastvote_ge_2019)
You_Gov$profile_newspaper_readership <- as.factor(You_Gov$profile_newspaper_readership)
You_Gov$voted_ge_2019 <- as.factor(You_Gov$voted_ge_2019)

# Interview start time -> calendar day (parsed as GMT)
You_Gov$starttime <- as.Date(as.character(as.POSIXct(
  strptime(as.character(You_Gov$starttime), format = "%Y-%m-%d %H:%M:%OS", tz = "GMT")
)))

# Filter for valid values
You_Gov <- You_Gov %>%
  dplyr::filter(CAM_demtrack_satisfied %in% c("1", "2", "3", "4")) %>%
  dplyr::filter(pastvote_EURef %in% c("1", "2", "3")) %>%
  #  dplyr::filter(pastvote_ge_2019 %in% c("1", "2", "3", "4")) %>%
  dplyr::filter(profile_newspaper_readership %in% c("2", "3", "5", "6", "7", "8", "9", "10", "16"))

# TASK 1. To change pastvote_ge_2019 to 95 when voted_ge_2019 == 2
# Go through as.character(): as.numeric() applied directly to a factor
# returns the internal level index, not the survey code (e.g. "98" would
# become whatever position that level has).
You_Gov$pastvote_ge_2019 <- as.numeric(as.character(You_Gov$pastvote_ge_2019))
You_Gov$voted_ge_2019 <- as.numeric(as.character(You_Gov$voted_ge_2019))

# NAs as 0s
You_Gov$pastvote_ge_2019[is.na(You_Gov$pastvote_ge_2019)] <- 0

# Respondents with voted_ge_2019 == 2 get past-vote code 95
# (%in% treats a missing voted_ge_2019 as "no match")
You_Gov <- You_Gov %>%
  mutate(pastvote_ge_2019 = replace(pastvote_ge_2019, voted_ge_2019 %in% 2, 95))

You_Gov$pastvote_ge_2019 <- as.factor(You_Gov$pastvote_ge_2019)
You_Gov$voted_ge_2019 <- as.factor(You_Gov$voted_ge_2019)

# To double check that replacement was correctly done
#You_Gov %>% select(voted_ge_2019, pastvote_ge_2019) %>% filter(voted_ge_2019 == "2")

#summary(as.factor(You_Gov$voted_ge_2019))
#summary(as.factor(You_Gov$pastvote_ge_2019))

library(dplyr)
# Task 2. Assign to each newspaper a name, and to those who don't read newspapers then BBC News

# Rename all relevant factors to consider in model
You_Gov <- You_Gov %>%
  dplyr::mutate(CAM_demtrack_satisfied = fct_recode(CAM_demtrack_satisfied,
                                                    "Satisfied" = "1",
                                                    "Satisfied" = "2",
                                                    "Not Satisfied" = "3",
                                                    "Not Satisfied" = "4")) %>%
  dplyr::mutate(pastvote_EURef = fct_recode(pastvote_EURef,
                                            "Voted Remain" = "1",
                                            "Voted Leave" = "2",
                                            "Did not vote" = "3")) %>%
  dplyr::mutate(profile_newspaper_readership = fct_recode(profile_newspaper_readership,
                                                          "daily_mail" = "2",
                                                          "daily_mirror" = "3",
                                                          "sun" = "5",
                                                          "telegraph" = "6",
                                                          "ft" = "7",
                                                          "guardian" = "8",
                                                          "independent" = "9",
                                                          "times" = "10",
                                                          "bbc_news" = "16")) %>%
  dplyr::mutate(pastvote_ge_2019 = fct_recode(pastvote_ge_2019,
                                              "Conservatives" = "1",
                                              "Labour" = "2",
                                              "Liberal Democrats" = "3",
                                              "Scottish National Party" = "4",
                                              "Did not vote" = "95"))

library(gdata)
# Drop unused factor levels in the whole data frame.
# drop.levels() takes the data frame only: its second argument is `reorder`
# (a TRUE/FALSE flag), so passing a column there is an error in recent R.
You_Gov <- drop.levels(You_Gov)

# Double check that
#summary(You_Gov$profile_newspaper_readership)
#summary(as.factor(wide_frame$newspaper))

# Re factor educational level

You_Gov$profile_education_level <- as.factor(as.character(You_Gov$profile_education_level))

You_Gov <- You_Gov %>%
  mutate(profile_education_level = fct_recode(profile_education_level,
                                              "No_education" = "1",
                                              "secondary_education" = "2",
                                              "secondary_education" = "3",
                                              "secondary_education" = "4",
                                              "secondary_education" = "5",
                                              "secondary_education" = "6",
                                              "secondary_education" = "7",
                                              "secondary_education" = "8",
                                              "secondary_education" = "9",
                                              "secondary_education" = "10",
                                              "secondary_education" = "11",
                                              "other_higher_qualification" = "12",
                                              "other_higher_qualification" = "13",
                                              "other_higher_qualification" = "14",
                                              "University_level_education" = "15",
                                              "University_level_education" = "16",
                                              "University_level_education" = "17",
                                              "other_higher_qualification" = "18",
                                              "don't_know" = "19",
                                              "prefer_not_to_say" = "20"))



# Control for education, political attention and other 
# political_attention
# profile_education

```

```{r, merge data wide frame with yougov data and write down}

library(dplyr)

# Merge the wide tweet table with the survey on newspaper and day, then
# write the combined data set.

# Tweet side: rename the day column and align key types
wide_frame <- wide_frame %>%
  dplyr::rename(created_at = created_at.x)
wide_frame$newspaper <- as.factor(as.character(wide_frame$newspaper))
wide_frame$created_at <- as.Date(as.character(as.POSIXct(wide_frame$created_at)))

# Survey side: rename the key columns to match the tweet table
You_Gov <- You_Gov %>%
  dplyr::rename(
    newspaper = profile_newspaper_readership,
    created_at = starttime
  )

# to double check dates they match
#summary(wide_frame$newspaper)
#summary(You_Gov$newspaper)

### UNDERSTAND WHY MERGIN IS MULTIPLYING CASES
# NOTE(review): an inner join returns one row per matching pair, so several
# respondents sharing a newspaper and day each match the same tweet row --
# confirm whether that explains the row count.
CS_Tweets <- dplyr::inner_join(
  wide_frame,
  You_Gov,
  by = c("newspaper", "created_at")
)

#CS_Tweets<- You_Gov %>%
 #           dplyr::inner_join(wide_frame , by = c("newspaper", "created_at"))

# If done correctly then write down

setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

write.csv(CS_Tweets, "combined_survey_tweets.csv")

```

## Final step to produce dataset with new variables from workshop and re-factored levels

```{r, upload data with survey and tweets}
# Reload the combined survey/tweet data set, reading strings as factors
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Omar's Windows machine
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

CS_Tweets <- read.csv(
  "combined_survey_tweets.csv",
  stringsAsFactors = TRUE
)



```

```{r, create new variables of demtrack and lastvote}



# Build `lastvote`: the 2019 general-election vote, falling back to the 2017
# vote for rows whose 2019 value is "0".

# Collapse the 2019 codes 5-9 into a single "Other" level.
CS_Tweets <- CS_Tweets %>%
  mutate(pastvote_ge_2019 = fct_recode(pastvote_ge_2019,
                                       "Other" = "5", "Other" = "6",
                                       "Other" = "7", "Other" = "8",
                                       "Other" = "9"))

# Rebuild the 2017 vote as a factor from its text so the codes can be relabelled.
CS_Tweets$pastvote_2017 <- as.factor(as.character(CS_Tweets$pastvote_2017))

# Label the 2017 codes; 99 is mapped to "0", the value treated as "no usable
# answer" further down.
CS_Tweets <- CS_Tweets %>%
  mutate(pastvote_2017 = fct_recode(pastvote_2017,
                                    "Other" = "5", "Other" = "6",
                                    "Other" = "7", "Other" = "8",
                                    "Other" = "9",
                                    "Conservatives" = "1",
                                    "Labour" = "2",
                                    "Liberal Democrats" = "3",
                                    "Scottish National Party" = "4",
                                    "Other" = "98", "0" = "99"))

# Row-aligned fallback. The replacement values are subset with the same index
# as the target; passing the whole `pastvote_2017` column would hand
# `replace()` a full-length vector, and the 2017 values would then be taken
# from the first rows of the data instead of from the rows being replaced.
# `which()` also keeps missing 2019 values out of the index.
CS_Tweets <- CS_Tweets %>%
  mutate(lastvote = replace(pastvote_ge_2019,
                            which(pastvote_ge_2019 == "0"),
                            pastvote_2017[which(pastvote_ge_2019 == "0")]))

# Drop rows with no usable last vote (missing, a literal "NA" label should one
# exist, or "0"), then drop the factor levels that became empty.
# `drop.levels()` is called without extra arguments: in a pipe, a second
# positional argument is bound to its `reorder` flag rather than selecting a
# column.
CS_Tweets <- CS_Tweets %>%
  filter(!is.na(lastvote), !lastvote %in% c("NA", "0")) %>%
  drop.levels()

#summary(CS_Tweets$lastvote)

#Dataset_to_check<- CS_Tweets %>% select(pastvote_2017, pastvote_ge_2019, lastvote)


```

```{r, refactor all levels for regressions}

# Rebuild the categorical regression inputs as factors from their text labels.
CS_Tweets <- CS_Tweets %>%
  mutate(across(
    all_of(c("CAM_demtrack_satisfied", "newspaper", "pastvote_EURef",
             "profile_gender", "profile_GOR", "profile_gross_household")),
    function(v) as.factor(as.character(v))
  ))

# Reduce the timestamp to a calendar day (POSIXct -> text -> Date).
CS_Tweets$created_at <- as.Date(as.character(as.POSIXct(CS_Tweets$created_at)))

# Replace the numeric survey codes with readable labels.
CS_Tweets <- CS_Tweets %>%
  mutate(
    profile_gender = fct_recode(
      profile_gender,
      "Male" = "1",
      "Female" = "2"
    ),
    profile_GOR = fct_recode(
      profile_GOR,
      "North_East" = "1",
      "North_West" = "2",
      "Yorkshire_and_the_Humber" = "3",
      "East_Midlands" = "4",
      "West_Midlands" = "5",
      "East_of_England" = "6",
      "London" = "7",
      "South_East" = "8",
      "South_West" = "9",
      "Wales" = "10",
      "Scotland" = "11",
      "Northern_Ireland" = "12"
    ),
    profile_gross_household = fct_recode(
      profile_gross_household,
      "under £5,000 per year" = "1",
      "£5,000 to £9,999 per year" = "2",
      "£10,000 to £14,999 per year" = "3",
      "£15,000 to £19,999 per year" = "4",
      "£20,000 to £24,999 per year" = "5",
      "£25,000 to £29,999 per year" = "6",
      "£30,000 to £34,999 per year" = "7",
      "£35,000 to £39,999 per year" = "8",
      "£40,000 to £44,999 per year" = "9",
      "£45,000 to £49,999 per year" = "10",
      "£50,000 to £59,999 per year" = "11",
      "£60,000 to £69,999 per year" = "12",
      "£70,000 to £99,999 per year" = "13",
      "£100,000 to £149,999 per year" = "14",
      "£150,000 and over" = "15",
      "Don't know" = "16",
      "Prefer_not_to_answer" = "17"
    )
  )

#summary(CS_Tweets$profile_GOR)
#summary(CS_Tweets$profile_gross_household)

# Fix the reference (first) level of each factor used in the regressions.
CS_Tweets <- CS_Tweets %>%
  mutate(
    pastvote_EURef = relevel(pastvote_EURef, ref = "Did not vote"),
    profile_gender = relevel(profile_gender, ref = "Female"),
    lastvote = relevel(lastvote, ref = "Did not vote"),
    newspaper = relevel(newspaper, ref = "bbc_news")
  )

# Binary dependent variable for the regressions:
# 1 = "Satisfied", 0 = "Not Satisfied", NA for any other value.
CS_Tweets <- CS_Tweets %>%
  mutate(DEM_SAT = case_when(
    CAM_demtrack_satisfied == "Satisfied" ~ 1,
    CAM_demtrack_satisfied == "Not Satisfied" ~ 0,
    TRUE ~ NA_real_
  ))



```

```{r, save dataset CS_Tweets}


# Write the re-factored dataset back to the shared project folder (macOS path).
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Windows alternative for the same folder:
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

# This overwrites the file that was read at the start of this section.
write.csv(CS_Tweets, file = "combined_survey_tweets.csv")

#glimpse(CS_Tweets)
```

UNTIL HERE DATA ACTUALLY USED + assignment topics to adjust later

## Merge with economic and covid data

```{r, upload combined survey tweets data, eval= FALSE}

# Xavi's path (reads the dated export directly):
#Mod_Dataset<- read.csv("/Volumes/GoogleDrive/Other computers/My Laptop/Documents/_University/2021 Cambridge/_Project/_Tracker/Data/combined_survey_tweets_february_11.csv", stringsAsFactors = TRUE)

# Omar's path (macOS). Comment out to use your own.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Windows alternative:
#setwd("C:/Users/omarh//OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Load the combined survey + tweets data; text columns are read as factors.
Mod_Dataset <- read.csv(
  file = "combined_survey_tweets.csv",
  stringsAsFactors = TRUE
)

#summary(Mod_Dataset)
```

```{r, check factors reference levels, eval= FALSE}

#Mod_Dataset$lastvote = relevel(Mod_Dataset$lastvote, ref = "Did not vote")

# Income is a labelled factor at this point; work on its text labels.
Mod_Dataset$profile_gross_household <- as.character(Mod_Dataset$profile_gross_household)

# Transform the income bands into their ordinal code (1-15). The two
# non-answers become real missing values rather than the string "NA", so the
# numeric conversion below does not depend on a coercion warning.
Mod_Dataset2 <- Mod_Dataset %>%
  mutate(profile_gross_household = recode(
    profile_gross_household,
    "under £5,000 per year" = "1",
    "£5,000 to £9,999 per year" = "2",
    "£10,000 to £14,999 per year" = "3",
    "£15,000 to £19,999 per year" = "4",
    "£20,000 to £24,999 per year" = "5",
    "£25,000 to £29,999 per year" = "6",
    "£30,000 to £34,999 per year" = "7",
    "£35,000 to £39,999 per year" = "8",
    "£40,000 to £44,999 per year" = "9",
    "£45,000 to £49,999 per year" = "10",
    "£50,000 to £59,999 per year" = "11",
    "£60,000 to £69,999 per year" = "12",
    "£70,000 to £99,999 per year" = "13",
    "£100,000 to £149,999 per year" = "14",
    "£150,000 and over" = "15",
    "Don't know" = NA_character_,
    "Prefer_not_to_answer" = NA_character_
  ))

Mod_Dataset2$profile_gross_household <- as.numeric(Mod_Dataset2$profile_gross_household)

# Standardise the numeric predictors (income, weekly salience, weekly sentiment)
# to mean 0 and standard deviation 1.
#install.packages("standardize")
library(standardize)

# Topic suffixes shared by the salience (`weekavg_*`) and sentiment
# (`weekly_mean_sentiment_R_*`) columns. Spellings follow the column names.
topic_names <- c(
  "Climate_Change", "Covid", "Crime", "Daily_News", "Enterteinment",
  "Football", "Hong_Kong", "Lockdown", "Lockdown_Travel", "Police_Brutality",
  "Puppy_News", "Royal_Family", "Uk_Politics", "US_Politics", "Weather"
)

cols_to_scale <- c(
  "profile_gross_household",
  paste0("weekavg_", topic_names),
  paste0("weekly_mean_sentiment_R_", topic_names)
)

# `scale()` returns a one-column matrix; `as.numeric()` stores the result as a
# plain numeric vector so the data frame does not end up with matrix columns.
Mod_Dataset2[cols_to_scale] <- lapply(
  Mod_Dataset2[cols_to_scale],
  function(v) as.numeric(scale(v))
)

# Week as ticker: going through a factor turns each distinct week into its
# position (1, 2, 3, ...) among the sorted distinct values.
Mod_Dataset2$week <- as.factor(Mod_Dataset2$week)
Mod_Dataset2$week <- as.numeric(Mod_Dataset2$week)

```

```{r, date into date format, eval= FALSE}
# Convert `created_at` to Date in one step: take its text form (it is
# presumably a factor after read.csv(stringsAsFactors = TRUE)) and parse the
# "YYYY-MM-DD" stamp directly.
Mod_Dataset2$created_at <- as.Date(
  as.character(Mod_Dataset2$created_at),
  format = "%Y-%m-%d"
)


```

## Merge economic data

```{r, merge econ data, eval= FALSE}

# ---- Monthly economic data ----
# macOS path to the raw economic series.
setwd("/Users/HammoudG/Documents/GitHub/Satisfaction_Democracy/data-raw/Economic_data/")

ECON <- read.csv(file = "UK_monthly_economic_data.csv")

# Parse the "YYYY-MM-DD" text of the month stamp into Date.
ECON$date <- as.Date(as.character(ECON$date), format = "%Y-%m-%d")

# Attach the economic series by calendar month: `month` is the first day of
# the month of `created_at`, matched against ECON's `date`.
Mod_Dataset2 <- Mod_Dataset2 %>%
  group_by(month = lubridate::floor_date(created_at, unit = "month")) %>%
  left_join(ECON, by = c("month" = "date")) %>%
  ungroup()

# ---- Daily covid data ----
# Omar's path (macOS). Comment out to use your own.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

COVID <- read.csv(file = "owid-covid-data.csv")

COVID$location <- as.factor(as.character(COVID$location))

# Keep the UK rows and only the location, date and indicator columns.
COVID <- COVID %>%
  filter(location == "United Kingdom") %>%
  select(location, date, new_deaths:new_people_vaccinated_smoothed_per_hundred)

# Parse the "YYYY-MM-DD" text of the day stamp into Date.
COVID$date <- as.Date(as.character(COVID$date), format = "%Y-%m-%d")

# Attach the covid indicators for the exact day of each observation.
Mod_Dataset2 <- left_join(Mod_Dataset2, COVID, by = c("created_at" = "date"))

# Save the dataset with covid and economic data for regression modelling.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(Mod_Dataset2, file = "combined_survey_tweets.csv")

```




# Assign Sentiment to topics

```{r, upload data for regression}

# Project folder (macOS path).
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Windows alternative for the same folder:
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

# Load the regression dataset; text columns are read as factors.
Mod_Dataset <- read.csv(
  file = "combined_survey_tweets.csv",
  stringsAsFactors = TRUE
)

# Set the reference levels and derive each outlet's ideological grouping.
Mod_Dataset <- Mod_Dataset %>%
  mutate(
    lastvote = relevel(lastvote, ref = "Did not vote"),
    profile_education_level = relevel(profile_education_level,
                                      ref = "No_education"),
    newspaper_ideology = fct_recode(
      newspaper,
      "None" = "bbc_news",
      "FT" = "ft",
      "Right Wing Newspaper" = "daily_mail",
      "Right Wing Newspaper" = "sun",
      "Right Wing Newspaper" = "telegraph",
      "Right Wing Newspaper" = "times",
      "Left Wing Newspaper" = "independent",
      "Left Wing Newspaper" = "guardian",
      "Left Wing Newspaper" = "daily_mirror"
    )
  )

#Mod_Dataset1<- Mod_Dataset %>% 
#              filter(newspaper_ideology != "None") %>%
#               filter(newspaper_ideology != "FT") %>%
#              drop.levels()

# Multiply retweets by 1,000 to ease interpretation coefficients

#Mod_Dataset$retweets_Climate_Change<- Mod_Dataset$retweets_Climate_Change*1000
#Mod_Dataset$retweets_Covid<- Mod_Dataset$retweets_Covid*1000
#Mod_Dataset$retweets_Crime<- Mod_Dataset$retweets_Crime*1000
#Mod_Dataset$retweets_Daily_News<- Mod_Dataset$retweets_Daily_News*1000
#Mod_Dataset$retweets_Enterteinment<- Mod_Dataset$retweets_Enterteinment*1000
#Mod_Dataset$retweets_Football<- Mod_Dataset$retweets_Football*1000
#Mod_Dataset$retweets_Hong_Kong<- Mod_Dataset$retweets_Hong_Kong*1000
#Mod_Dataset$retweets_Lockdown<- Mod_Dataset$retweets_Lockdown*1000
#Mod_Dataset$retweets_Lockdown_Travel<- Mod_Dataset$retweets_Lockdown_Travel*1000
#Mod_Dataset$retweets_Police_Brutality<- Mod_Dataset$retweets_Police_Brutality*1000
#Mod_Dataset$retweets_Puppy_News<- Mod_Dataset$retweets_Puppy_News*1000
#Mod_Dataset$retweets_Royal_Family<- Mod_Dataset$retweets_Royal_Family*1000
#Mod_Dataset$retweets_Uk_Politics<- Mod_Dataset$retweets_Uk_Politics*1000
#Mod_Dataset$retweets_US_Politics<- Mod_Dataset$retweets_US_Politics*1000
#Mod_Dataset$retweets_Weather<- Mod_Dataset$retweets_Weather*1000

```

```{r, normalise retweets by substituting NAs to 0}


# Alternatives that were tried and left disabled:
# drop the rows with missing retweet values
#Mod_Dataset<- Mod_Dataset %>%
#  drop_na(6:35)

# replace every NA in the data with zero
#Mod_Dataset_Sel <- Mod_Dataset_Sel %>% replace(is.na(.), 0)

# In the fifteen per-topic retweet columns only, replace NA with 0.
Mod_Dataset <- Mod_Dataset %>%
  mutate(across(
    all_of(c(
      "retweets_Climate_Change", "retweets_Covid", "retweets_Crime",
      "retweets_Daily_News", "retweets_Enterteinment", "retweets_Football",
      "retweets_Hong_Kong", "retweets_Lockdown", "retweets_Lockdown_Travel",
      "retweets_Police_Brutality", "retweets_Puppy_News",
      "retweets_Royal_Family", "retweets_Uk_Politics", "retweets_US_Politics",
      "retweets_Weather"
    )),
    function(v) ifelse(is.na(v), 0, v)
  ))


# Min-Max normalisation: linearly rescale `x` so that its smallest value maps
# to 0 and its largest to 1.
#   x: numeric vector.
# Returns a numeric vector of the same length as `x`. Any NA in `x` makes the
# whole result NA; a constant `x` gives NaN (0 / 0).
min_max_norm <- function(x) {
  bounds <- range(x)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}

# Extract the row id plus the block of retweet columns to normalise.
Mod_Dataset_Sel <- Mod_Dataset %>% select(X, retweets_Climate_Change:retweets_Weather)

# Apply Min-Max normalisation and write the results back over the old values.
# The target columns are addressed by NAME rather than by position: the
# position of the retweet columns in `Mod_Dataset` depends on how many
# row-name columns earlier write.csv()/read.csv() round trips have added, so a
# fixed index range could overwrite unrelated columns.
retweet_cols <- names(Mod_Dataset_Sel)[2:16]
Mod_Dataset[retweet_cols] <- lapply(Mod_Dataset_Sel[retweet_cols], min_max_norm)


```

```{r, Upload data produced up here of All_Tweets_Classified_Sentiment to classify topics according to mean sentiment value, eval = FALSE}

# Omar's path (macOS). Comment out to use your own.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

# Windows alternative for the same folder:
#setwd("C:/Users/omarh/OneDrive - London School of Economics/Satisfaction_Democracy_Project")

# Load the classified tweets; text stays character and is read as UTF-8.
CS_Tweets <- read.csv(
  file = "All_Tweets_Classified_Sentiment.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

# Topic as a factor, drop the `X` column (duplicate column left by an earlier
# save), and give the sentiment score and outlet columns their working names.
CS_Tweets <- CS_Tweets %>%
  mutate(topic = as.factor(as.character(topic))) %>%
  select(-X) %>%
  dplyr::rename(sentiment_value = value, newspaper = newspaper.x)

# Parse the "YYYY-MM-DDTHH:MM:SS" stamp as GMT and keep only the calendar day.
CS_Tweets$created_at.x <- as.Date(
  strptime(CS_Tweets$created_at.x, format = "%Y-%m-%dT%H:%M:%OS", tz = "GMT")
)

#setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

#write.csv(CS_Tweets, "All_Tweets_Classified_Sentiment.csv")

```

```{r, identify topics as negative vs neutral/positive, eval = FALSE}

# Calculate mean sentiment of each topic.
# NOTE(review): this reads `Tweets`, while the chunk just above loads the
# classified file into `CS_Tweets` -- confirm which object is intended.
CS_Tweets_Sentiment <- Tweets %>%
  group_by(topic) %>%
  #drop_na() %>%
  summarise(mean = mean(sentiment_value))

# Calculate the mean of the topic means for the period concerned
# (a one-row, one-column summary).
CS_Tweets_Sentiment_mean <- CS_Tweets_Sentiment %>%
  select(mean) %>%
  summarise(mean = mean(mean))

# Value recorded from an earlier run: -0.7264089

# Flag a topic as negative when its mean sentiment is below the overall mean
# computed above. The threshold is taken from `CS_Tweets_Sentiment_mean`
# instead of a pasted constant, so it cannot go stale when the data change.
# NOTE(review): the label "Positive/Neutral_Topic" differs from the
# "Neutral/Positive_Topic" label written to disk in the "assign topics
# sentiments" chunk -- confirm which spelling is wanted.
CS_Tweets_Sentiment <- CS_Tweets_Sentiment %>%
  mutate(Type_Sentiment = ifelse(mean < CS_Tweets_Sentiment_mean$mean,
                                 "Negative_Topic", "Positive/Neutral_Topic"))
  

```

```{r, assign topics sentiments, eval= FALSE}

# Tag every tweet by the sentiment class of its topic: the four topics listed
# are negative, everything else is neutral/positive.
Tweets <- mutate(
  Tweets,
  Type_Sentiment = ifelse(
    topic %in% c("Climate_Change", "Crime", "Violence", "US_Politics"),
    "Negative_Topic",
    "Neutral/Positive_Topic"
  )
)

#summary(as.factor(Tweets$Type_Sentiment))

# Save the tagged tweets to the project folder.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(Tweets, file = "All_Tweets_Classified_Sentiment.csv")


```

```{r, rename topics}
# Merge and rename topic labels: both lockdown topics fold into "Covid",
# "Hong_Kong" into "Daily_News"; "Police_Brutality" and "Football" are renamed.
Tweets <- mutate(
  Tweets,
  topic = fct_recode(
    topic,
    "Violence" = "Police_Brutality",
    "Covid" = "Lockdown",
    "Covid" = "Lockdown_Consequences",
    "Daily_News" = "Hong_Kong",
    "Sports" = "Football"
  )
)
head(Tweets)

# Remove the first three columns, which are not used.
Tweets <- Tweets[, -(1:3)]
#summary(Tweets$topic)

# Save the result to the project folder.
setwd("/Users/HammoudG/OneDrive - London School of Economics/Satisfaction_Democracy_Project/")

write.csv(Tweets, file = "All_Tweets_Classified_Sentiment.csv")


```